url <- "https://raw.githubusercontent.com/DJScully02/Personal_Portfolio/main/data/Meter%20Data.xlsx"

# Create a temporary file
temp_file <- tempfile(fileext = ".xlsx")

# Download the file
curl_download(url, temp_file)

meter <- read_excel(temp_file, sheet = "D")

# Clean up the temporary file
unlink(temp_file)
print(meter)
## # A tibble: 180 × 44
##    Profile_Factor Symmetry Crossflow Flow_Velocity1 Flow_Velocity2
##             <dbl>    <dbl>     <dbl>          <dbl>          <dbl>
##  1           1.10    1.00      0.995           2.35           2.60
##  2           1.09    0.998     0.998           3.40           3.71
##  3           1.08    1.01      0.999           3.44           3.71
##  4           1.09    1.01      0.995           3.40           3.72
##  5           1.09    1.01      1.00            3.40           3.70
##  6           1.08    1.00      0.989           3.39           3.72
##  7           1.10    1.00      1.02            3.40           3.70
##  8           1.09    1.00      0.994           4.13           4.52
##  9           1.08    1.01      1.00            5.24           5.63
## 10           1.09    1.00      1.00            5.77           6.27
## # ℹ 170 more rows
## # ℹ 39 more variables: Flow_Velocity3 <dbl>, Flow_Velocity4 <dbl>,
## #   Speed_of_Sound1 <dbl>, Speed_of_Sound2 <dbl>, Speed_of_Sound3 <dbl>,
## #   Speed_of_Sound4 <dbl>, Signal_Strength1 <dbl>, Signal_Strength2 <dbl>,
## #   Signal_Strength3 <dbl>, Signal_Strength4 <dbl>, Signal_Strength5 <dbl>,
## #   Signal_Strength6 <dbl>, Signal_Strength7 <dbl>, Signal_Strength8 <dbl>,
## #   Signal_Quality1 <dbl>, Signal_Quality2 <dbl>, Signal_Quality3 <dbl>, …
set.seed(123) # Set seed for reproducibility

v_keep <- c(
  1:43
)
M <- meter[,v_keep]
M
## # A tibble: 180 × 43
##    Profile_Factor Symmetry Crossflow Flow_Velocity1 Flow_Velocity2
##             <dbl>    <dbl>     <dbl>          <dbl>          <dbl>
##  1           1.10    1.00      0.995           2.35           2.60
##  2           1.09    0.998     0.998           3.40           3.71
##  3           1.08    1.01      0.999           3.44           3.71
##  4           1.09    1.01      0.995           3.40           3.72
##  5           1.09    1.01      1.00            3.40           3.70
##  6           1.08    1.00      0.989           3.39           3.72
##  7           1.10    1.00      1.02            3.40           3.70
##  8           1.09    1.00      0.994           4.13           4.52
##  9           1.08    1.01      1.00            5.24           5.63
## 10           1.09    1.00      1.00            5.77           6.27
## # ℹ 170 more rows
## # ℹ 38 more variables: Flow_Velocity3 <dbl>, Flow_Velocity4 <dbl>,
## #   Speed_of_Sound1 <dbl>, Speed_of_Sound2 <dbl>, Speed_of_Sound3 <dbl>,
## #   Speed_of_Sound4 <dbl>, Signal_Strength1 <dbl>, Signal_Strength2 <dbl>,
## #   Signal_Strength3 <dbl>, Signal_Strength4 <dbl>, Signal_Strength5 <dbl>,
## #   Signal_Strength6 <dbl>, Signal_Strength7 <dbl>, Signal_Strength8 <dbl>,
## #   Signal_Quality1 <dbl>, Signal_Quality2 <dbl>, Signal_Quality3 <dbl>, …
v <- c(3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3)
initial_centers <- matrix(
  data = c(v,-v,abs(v),-abs(v)),
  ncol = 43,
  nrow = 3,
  byrow = TRUE
)
## Warning in matrix(data = c(v, -v, abs(v), -abs(v)), ncol = 43, nrow = 3, : data
## length [180] is not a sub-multiple or multiple of the number of columns [43]
options(digits = 2)
expand.grid(c(-3,3),c(-3,3),c(-3,3))
##   Var1 Var2 Var3
## 1   -3   -3   -3
## 2    3   -3   -3
## 3   -3    3   -3
## 4    3    3   -3
## 5   -3   -3    3
## 6    3   -3    3
## 7   -3    3    3
## 8    3    3    3
# Scale the data
scale_M <- scale(M)

prcomp_M <- prcomp(scale_M)

K-means

kmeans_meter_1 <- kmeans(
  x = scale_M,
  centers = initial_centers
)
kmeans_meter_1
## K-means clustering with 3 clusters of sizes 39, 12, 129
## 
## Cluster means:
##   Profile_Factor Symmetry Crossflow Flow_Velocity1 Flow_Velocity2
## 1          1.122    0.335      1.42           0.34           0.30
## 2          0.083   -0.515     -1.52          -2.28          -2.36
## 3         -0.347   -0.053     -0.29           0.11           0.13
##   Flow_Velocity3 Flow_Velocity4 Speed_of_Sound1 Speed_of_Sound2 Speed_of_Sound3
## 1           0.26          -1.26            0.76           0.415            0.85
## 2          -2.17          -1.42            1.07          -1.682           -1.06
## 3           0.12           0.51           -0.33           0.031           -0.16
##   Speed_of_Sound4 Signal_Strength1 Signal_Strength2 Signal_Strength3
## 1            1.61            -0.71            -0.70            -0.38
## 2            0.51            -2.44            -2.43            -3.12
## 3           -0.53             0.44             0.44             0.41
##   Signal_Strength4 Signal_Strength5 Signal_Strength6 Signal_Strength7
## 1            -0.38            -0.63            -0.63            -1.57
## 2            -3.11            -2.39            -2.38            -1.57
## 3             0.41             0.41             0.41             0.62
##   Signal_Strength8 Signal_Quality1 Signal_Quality2 Signal_Quality3
## 1            -1.57           -0.21           -0.24            0.43
## 2            -1.57           -3.19           -3.15           -3.26
## 3             0.62            0.36            0.37            0.18
##   Signal_Quality4 Signal_Quality5 Signal_Quality6 Signal_Quality7
## 1            0.56           -0.83           -0.80           -1.62
## 2           -3.31           -2.47           -2.41           -1.46
## 3            0.14            0.48            0.47            0.62
##   Signal_Quality8 Gain1 Gain2 Gain3 Gain4 Gain5 Gain6 Gain7 Gain8 Transit_Time1
## 1           -1.60  0.62  0.62  0.40  0.40  0.72  0.72  1.57  1.57         -0.38
## 2           -1.48  2.32  2.32  2.83  2.82  2.32  2.32  1.58  1.58         -2.04
## 3            0.62 -0.40 -0.40 -0.38 -0.38 -0.43 -0.43 -0.62 -0.62          0.30
##   Transit_Time2 Transit_Time3 Transit_Time4 Transit_Time5 Transit_Time6
## 1         -0.81       -0.1808         -0.51        -0.653        -0.774
## 2          1.01        0.0073          3.21        -0.047         1.906
## 3          0.15        0.0540         -0.15         0.202         0.057
##   Transit_Time7 Transit_Time8
## 1         -1.48         -1.45
## 2         -0.74         -0.36
## 3          0.52          0.47
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 1239 1076  756
##  (between_SS / total_SS =  60.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
factoextra::fviz_nbclust(
    x = scale_M,
    FUNcluster = kmeans,
    method = "wss",
  )

useful::plot.kmeans(
  x = kmeans_meter_1,
  data = scale_M  
)

cluster::clusplot(
  scale_M,
  kmeans_meter_1$cluster,
  color=TRUE,
  shade=TRUE,
  labels=2,
  lines=0,
  main = "k-means clusters"
)

fpc::plotcluster(
  x = scale_M,
  clvecd = kmeans_meter_1$cluster
)

Looking at our plots above we can see that the elbow on the scree plot shows that we should have three groups as they capture the majority of the explained variance and are therefore the best to utilize. Then we visualize what those three groups would look like. We are able to see that utilizing three groups we are able to see a clear distinction in the groups. We can also see from the plot with the confidence ellipses that they are well separated.

Hierarchical Clustering

# Define distance metrics and hierarchical clustering methods
v_dist <- c("euclidean", "manhattan", "minkowski", "canberra", "maximum", "binary")
v_hclust <- c("ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid", "single")

# Calculate distance matrices
dist_M <- lapply(v_dist, function(y) dist(scale(M), method = y))
names(dist_M) <- v_dist

# Perform hierarchical clustering
L_hclust <- list()
for (j in v_dist) {
  for (k in v_hclust) {
    L_hclust[[j]][[k]] <- hclust(d = dist_M[[j]], method = k)
  }
}

# Initialize matrix to store coefficients
M_coef.hclust <- matrix(NA, nrow = length(v_dist), ncol = length(v_hclust))
rownames(M_coef.hclust) <- v_dist
colnames(M_coef.hclust) <- v_hclust

# Ensure heights are sorted and calculate coefficients
for (j in v_dist) {
  for (k in v_hclust) {
    hc <- L_hclust[[j]][[k]]
    if (is.unsorted(hc$height)) {
      hc$height <- sort(hc$height)
    }
    try({
      M_coef.hclust[j, k] <- coef.hclust(hc)
    }, silent = TRUE)
  }
}

# Remove rows and columns with all NA values before ordering and printing
M_coef.hclust <- M_coef.hclust[rowSums(is.na(M_coef.hclust)) != ncol(M_coef.hclust), ]
M_coef.hclust <- M_coef.hclust[, colSums(is.na(M_coef.hclust)) != nrow(M_coef.hclust)]

# Order and print coefficients matrix
M_coef.hclust <- M_coef.hclust[
  order(apply(X = M_coef.hclust, FUN = max, MARGIN = 1, na.rm = TRUE)),
  order(apply(X = M_coef.hclust, FUN = max, MARGIN = 2, na.rm = TRUE))
]

print(M_coef.hclust, digits = 3)
##           median single centroid mcquitty average complete ward.D2 ward.D
## maximum    0.936  0.943    0.943    0.940   0.942    0.950   0.961  0.991
## euclidean  0.919  0.928    0.942    0.937   0.943    0.945   0.984  0.996
## minkowski  0.919  0.928    0.942    0.937   0.943    0.945   0.984  0.996
## canberra   0.822  0.856    0.858    0.888   0.900    0.908   0.984  0.997
## manhattan  0.938  0.930    0.932    0.957   0.959    0.969   0.992  0.998
# Plot dendrograms
dev.new()
par(mfrow = c(1, 2))
plot(L_hclust[["euclidean"]][["ward.D"]])
plot(L_hclust[["euclidean"]][["single"]])
cutree(L_hclust[["euclidean"]][["ward.D"]], k = 4)
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4

It appears that Euclidean, Minkowski, and Manhattan distances generally preform very well. We additionally observe that in general ward.D and ward.D2 consistently yield the highest coefficients across all of the distances we tested which indicates they provide the best clustering quality for our data set.

Below, we create additional plots to visualize the Euclidean distance. Although it doesn’t perform the best, as seen in the table above, it is the easiest for most people to understand. One of the most important aspects of data analysis and science is ensuring that your stakeholders can process and understand your methods and results.

silhouette_2 <- cluster::silhouette(
  cutree(L_hclust[["euclidean"]][["ward.D"]],k = 2),
  dist_M[["euclidean"]]
)
plot(silhouette_2)

silhouette_3 <- cluster::silhouette(
  cutree(L_hclust[["euclidean"]][["ward.D"]],k = 3),
  dist_M[["euclidean"]]
)
plot(silhouette_3)

silhouette_4 <- cluster::silhouette(
  cutree(L_hclust[["euclidean"]][["ward.D"]],k = 4),
  dist_M[["euclidean"]]
)
plot(silhouette_4)

silhouette_5 <- cluster::silhouette(
  cutree(L_hclust[["euclidean"]][["ward.D"]],k = 5),
  dist_M[["euclidean"]]
)
plot(silhouette_5)

plot(
  prcomp_M$x,
  col = cutree(L_hclust[["euclidean"]][["ward.D"]],k = 4) + 1,
  pch = 19,
  main = "Principal component plot of hierarchical clusters",
  sub = "You can see the 4 distinct groups from the hierarchical clusters"
)

cluster.stats_M <- fpc::cluster.stats(
  d = dist_M[["euclidean"]],
  clustering = cutree(L_hclust[["euclidean"]][["ward.D"]],k = 2),
  alt.clustering = cutree(L_hclust[["euclidean"]][["ward.D"]],k = 3),
  noisecluster = FALSE,
  silhouette = TRUE,
  G2 = TRUE,
  G3 = TRUE,
  wgap = TRUE,
  sepindex = TRUE,
  sepprob = 0.1,
  sepwithnoise = TRUE,
  compareonly = FALSE,
  aggregateonly = FALSE
)

print(cluster.stats_M)
## $n
## [1] 180
## 
## $cluster.number
## [1] 2
## 
## $cluster.size
## [1] 129  51
## 
## $min.cluster.size
## [1] 51
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 12 25
## 
## $average.distance
## [1]  2.8 11.0
## 
## $median.distance
## [1]  2.6 10.9
## 
## $separation
## [1] 6.3 6.3
## 
## $average.toother
## [1] 12 12
## 
## $separation.matrix
##      [,1] [,2]
## [1,]  0.0  6.3
## [2,]  6.3  0.0
## 
## $ave.between.matrix
##      [,1] [,2]
## [1,]    0   12
## [2,]   12    0
## 
## $average.between
## [1] 12
## 
## $average.within
## [1] 5.1
## 
## $n.between
## [1] 6579
## 
## $n.within
## [1] 9531
## 
## $max.diameter
## [1] 25
## 
## $min.separation
## [1] 6.3
## 
## $within.cluster.ss
## [1] 4632
## 
## $clus.avg.silwidths
##    1    2 
## 0.77 0.04 
## 
## $avg.silwidth
## [1] 0.56
## 
## $g2
## [1] 0.85
## 
## $g3
## [1] 0.1
## 
## $pearsongamma
## [1] 0.69
## 
## $dunn
## [1] 0.26
## 
## $dunn2
## [1] 1.1
## 
## $entropy
## [1] 0.6
## 
## $wb.ratio
## [1] 0.43
## 
## $ch
## [1] 118
## 
## $cwidegap
## [1]  7.5 16.2
## 
## $widestgap
## [1] 16
## 
## $sindex
## [1] 6.3
## 
## $corrected.rand
## [1] 0.92
## 
## $vi
## [1] 0.19

With \(k = 4\), we found the best silhouette width of \(0.59\), which indicates fairly good clustering quality. The principal component plot visually demonstrates how the hierarchical clustering categorizes the data into four distinct groups. This visualization helps to understand the distribution and separation of the clusters in the data set.

list_dist <- lapply(
  X = v_dist,
  FUN = function(distance_method) dist(
    x = scale_M,
    method = distance_method
  )
)
names(list_dist) <- v_dist
list_hclust <- list()
for(j in v_dist) for(k in v_hclust) list_hclust[[j]][[k]] <- hclust(
  d = list_dist[[j]],
  method = k
)
par(
  mfrow = c(length(v_dist),length(v_hclust)),
  mar = c(0,0,0,0),
  mai = c(0,0,0,0),
  oma = c(0,0,0,0)
)
for(j in v_dist) for(k in v_hclust) plot(
  x = list_hclust[[j]][[k]],
  labels = FALSE,
  axes = FALSE,
  main = paste("\n",j,"\n",k)
)

plot(
  x = list_hclust[["euclidean"]][["ward.D"]],
  main = "Euclidean Ward's D Hierarchical",
  sub = ""
)

plot(
  x = list_hclust[["euclidean"]][["single"]],
  main = "Euclidean Single Hierarchical",
  sub = "We can see a possible outlier in obs 170"
)

As stated above we went with the most frequently used Euclidean distance. We also decided to take a closer look at both single linkage and Wards D. Looking at our Single linkage it seems that observation 170 may be an outlier in the data set.So we may want to remove this value after reviewing it with more subject knowledge. The Silhouette for Wards D looks really good though so I would personally choose wards D after checking 170.

Model Based clustering

Mclust_meter <- Mclust(
  data = meter,
  G = 3
)
Mclust_meter
## 'Mclust' model object: (VEI,3) 
## 
## Available components: 
##  [1] "call"           "data"           "modelName"      "n"             
##  [5] "d"              "G"              "BIC"            "loglik"        
##  [9] "df"             "bic"            "icl"            "hypvol"        
## [13] "parameters"     "z"              "classification" "uncertainty"
summary(
  object = Mclust_meter
)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEI (diagonal, equal shape) model with 3 components: 
## 
##  log-likelihood   n  df    BIC    ICL
##          -18437 180 180 -37809 -37809
## 
## Clustering table:
##  1  2  3 
## 52 58 70
logLik(
  object = Mclust_meter
)
## 'log Lik.' -18437 (df=180)
predict(
  object = Mclust_meter
)
## $classification
##   [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
##  [38] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 3 3 3
##  [75] 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
## [112] 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## $z
##              1        2       3
##   [1,] 1.0e+00  7.8e-87 5.1e-77
##   [2,] 1.0e+00  1.4e-47 6.2e-78
##   [3,] 1.0e+00  9.4e-48 1.3e-77
##   [4,] 1.0e+00  5.3e-50 4.3e-77
##   [5,] 1.0e+00  1.2e-49 1.3e-76
##   [6,] 1.0e+00  1.4e-50 7.5e-76
##   [7,] 1.0e+00  3.6e-52 6.0e-75
##   [8,] 1.0e+00  1.1e-32 8.7e-68
##   [9,] 1.0e+00  2.9e-05 6.6e-55
##  [10,] 7.3e-19  1.0e+00 5.0e-72
##  [11,] 4.2e-35  1.0e+00 6.6e-83
##  [12,] 1.0e-36  1.0e+00 1.1e-84
##  [13,] 3.0e-37  1.0e+00 5.6e-83
##  [14,] 4.0e-37  1.0e+00 1.5e-79
##  [15,] 3.7e-37  1.0e+00 2.0e-81
##  [16,] 7.3e-37  1.0e+00 1.4e-83
##  [17,] 1.3e-47  1.0e+00 4.4e-83
##  [18,] 2.2e-54  1.0e+00 6.7e-76
##  [19,] 1.0e+00  5.8e-84 2.7e-75
##  [20,] 1.0e+00  3.1e-50 1.9e-77
##  [21,] 1.0e+00  2.9e-51 6.8e-77
##  [22,] 1.0e+00  9.1e-52 7.7e-77
##  [23,] 1.0e+00  2.6e-51 8.3e-77
##  [24,] 1.0e+00  5.5e-52 5.6e-76
##  [25,] 1.0e+00  1.8e-51 1.3e-75
##  [26,] 1.0e+00  3.6e-27 3.3e-72
##  [27,] 4.7e-03  1.0e+00 2.6e-62
##  [28,] 9.0e-14  1.0e+00 4.4e-64
##  [29,] 1.9e-25  1.0e+00 4.9e-65
##  [30,] 1.1e-31  1.0e+00 7.2e-77
##  [31,] 5.0e-33  1.0e+00 9.0e-80
##  [32,] 4.5e-35  1.0e+00 2.3e-79
##  [33,] 6.4e-34  1.0e+00 3.7e-76
##  [34,] 1.3e-32  1.0e+00 2.1e-70
##  [35,] 3.2e-45  1.0e+00 8.5e-76
##  [36,] 5.7e-55  1.0e+00 2.1e-73
##  [37,] 1.0e+00  3.5e-88 8.7e-71
##  [38,] 1.0e+00  5.4e-48 6.1e-73
##  [39,] 1.0e+00  4.8e-49 2.1e-72
##  [40,] 1.0e+00  4.1e-49 1.8e-72
##  [41,] 1.0e+00  7.8e-49 5.1e-72
##  [42,] 1.0e+00  7.8e-27 2.3e-69
##  [43,] 3.7e-05  1.0e+00 8.6e-66
##  [44,] 1.8e-18  1.0e+00 7.7e-72
##  [45,] 2.1e-32  1.0e+00 3.0e-75
##  [46,] 1.9e-32  1.0e+00 4.0e-75
##  [47,] 6.4e-32  1.0e+00 4.1e-75
##  [48,] 4.0e-32  1.0e+00 1.5e-75
##  [49,] 2.5e-31  1.0e+00 1.9e-74
##  [50,] 4.4e-41  1.0e+00 1.4e-73
##  [51,] 6.3e-49  1.0e+00 6.8e-67
##  [52,] 0.0e+00  0.0e+00 1.0e+00
##  [53,] 0.0e+00  0.0e+00 1.0e+00
##  [54,] 0.0e+00  0.0e+00 1.0e+00
##  [55,] 0.0e+00  0.0e+00 1.0e+00
##  [56,] 0.0e+00  0.0e+00 1.0e+00
##  [57,] 0.0e+00  0.0e+00 1.0e+00
##  [58,] 0.0e+00  0.0e+00 1.0e+00
##  [59,] 0.0e+00  0.0e+00 1.0e+00
##  [60,] 0.0e+00  0.0e+00 1.0e+00
##  [61,] 0.0e+00  0.0e+00 1.0e+00
##  [62,] 0.0e+00  0.0e+00 1.0e+00
##  [63,] 0.0e+00  0.0e+00 1.0e+00
##  [64,] 0.0e+00  0.0e+00 1.0e+00
##  [65,] 0.0e+00  0.0e+00 1.0e+00
##  [66,] 0.0e+00  0.0e+00 1.0e+00
##  [67,] 0.0e+00  0.0e+00 1.0e+00
##  [68,] 1.0e+00 6.4e-124 2.8e-66
##  [69,] 1.0e+00 1.8e-127 1.3e-61
##  [70,] 1.0e+00 5.7e-138 4.8e-61
##  [71,] 1.0e+00 1.9e-130 5.8e-63
##  [72,] 5.2e-47 3.0e-239 1.0e+00
##  [73,] 0.0e+00  0.0e+00 1.0e+00
##  [74,] 0.0e+00  0.0e+00 1.0e+00
##  [75,] 1.0e+00  5.1e-83 7.4e-74
##  [76,] 1.0e+00  5.7e-49 1.8e-74
##  [77,] 1.0e+00  2.1e-49 1.4e-73
##  [78,] 1.0e+00  1.5e-53 2.8e-69
##  [79,] 1.0e+00  2.7e-55 2.1e-68
##  [80,] 1.0e+00  1.5e-54 1.6e-67
##  [81,] 1.0e+00  2.3e-45 2.3e-76
##  [82,] 1.0e+00  1.3e-17 3.0e-72
##  [83,] 1.1e-11  1.0e+00 3.4e-66
##  [84,] 5.7e-24  1.0e+00 3.9e-74
##  [85,] 3.2e-38  1.0e+00 4.8e-83
##  [86,] 1.1e-37  1.0e+00 2.3e-84
##  [87,] 7.4e-37  1.0e+00 9.6e-85
##  [88,] 5.3e-36  1.0e+00 3.0e-84
##  [89,] 3.5e-35  1.0e+00 4.7e-83
##  [90,] 9.0e-35  1.0e+00 5.8e-82
##  [91,] 4.1e-45  1.0e+00 2.0e-77
##  [92,] 1.0e-51  1.0e+00 1.3e-69
##  [93,] 1.0e+00  1.1e-87 2.1e-73
##  [94,] 1.0e+00  1.5e-49 1.3e-75
##  [95,] 1.0e+00  3.5e-45 8.5e-80
##  [96,] 1.0e+00  8.6e-44 3.7e-80
##  [97,] 1.0e+00  2.2e-43 1.2e-77
##  [98,] 1.0e+00  1.1e-42 3.4e-73
##  [99,] 1.0e+00  4.3e-47 1.6e-67
## [100,] 1.0e+00  1.3e-21 1.4e-65
## [101,] 2.3e-08  1.0e+00 1.9e-66
## [102,] 5.6e-24  1.0e+00 8.0e-76
## [103,] 7.9e-37  1.0e+00 3.6e-81
## [104,] 2.1e-36  1.0e+00 5.4e-83
## [105,] 5.2e-37  1.0e+00 4.5e-84
## [106,] 1.2e-36  1.0e+00 1.0e-84
## [107,] 9.5e-37  1.0e+00 3.7e-85
## [108,] 9.5e-36  1.0e+00 3.4e-84
## [109,] 1.3e-47  1.0e+00 2.6e-81
## [110,] 2.3e-55  1.0e+00 4.3e-75
## [111,] 1.0e+00  3.5e-88 4.3e-64
## [112,] 1.0e+00  6.0e-88 1.3e-65
## [113,] 1.0e+00  8.0e-50 2.8e-69
## [114,] 1.0e+00  3.7e-48 1.3e-70
## [115,] 1.0e+00  8.3e-49 1.0e-71
## [116,] 1.0e+00  6.9e-49 3.3e-72
## [117,] 1.0e+00  1.0e-49 1.3e-72
## [118,] 1.0e+00  1.1e-47 3.1e-73
## [119,] 1.0e+00  4.0e-21 1.8e-71
## [120,] 1.3e-09  1.0e+00 1.7e-71
## [121,] 1.5e-23  1.0e+00 6.7e-78
## [122,] 2.9e-37  1.0e+00 2.4e-81
## [123,] 1.4e-35  1.0e+00 8.4e-82
## [124,] 8.9e-36  1.0e+00 2.0e-82
## [125,] 3.3e-35  1.0e+00 7.9e-82
## [126,] 8.1e-36  1.0e+00 6.7e-83
## [127,] 5.2e-35  1.0e+00 1.2e-80
## [128,] 3.3e-44  1.0e+00 5.6e-78
## [129,] 1.8e-49  1.0e+00 3.5e-64
## [130,] 0.0e+00  0.0e+00 1.0e+00
## [131,] 0.0e+00  0.0e+00 1.0e+00
## [132,] 0.0e+00  0.0e+00 1.0e+00
## [133,] 0.0e+00  0.0e+00 1.0e+00
## [134,] 0.0e+00  0.0e+00 1.0e+00
## [135,] 0.0e+00  0.0e+00 1.0e+00
## [136,] 0.0e+00  0.0e+00 1.0e+00
## [137,] 0.0e+00  0.0e+00 1.0e+00
## [138,] 0.0e+00  0.0e+00 1.0e+00
## [139,] 0.0e+00  0.0e+00 1.0e+00
## [140,] 0.0e+00  0.0e+00 1.0e+00
## [141,] 0.0e+00  0.0e+00 1.0e+00
## [142,] 0.0e+00  0.0e+00 1.0e+00
## [143,] 0.0e+00  0.0e+00 1.0e+00
## [144,] 0.0e+00  0.0e+00 1.0e+00
## [145,] 0.0e+00  0.0e+00 1.0e+00
## [146,] 0.0e+00  0.0e+00 1.0e+00
## [147,] 0.0e+00  0.0e+00 1.0e+00
## [148,] 0.0e+00  0.0e+00 1.0e+00
## [149,] 0.0e+00  0.0e+00 1.0e+00
## [150,] 0.0e+00  0.0e+00 1.0e+00
## [151,] 0.0e+00  0.0e+00 1.0e+00
## [152,] 0.0e+00  0.0e+00 1.0e+00
## [153,] 0.0e+00  0.0e+00 1.0e+00
## [154,] 0.0e+00  0.0e+00 1.0e+00
## [155,] 0.0e+00  0.0e+00 1.0e+00
## [156,] 0.0e+00  0.0e+00 1.0e+00
## [157,] 0.0e+00  0.0e+00 1.0e+00
## [158,] 0.0e+00  0.0e+00 1.0e+00
## [159,] 0.0e+00  0.0e+00 1.0e+00
## [160,] 0.0e+00  0.0e+00 1.0e+00
## [161,] 0.0e+00  0.0e+00 1.0e+00
## [162,] 0.0e+00  0.0e+00 1.0e+00
## [163,] 0.0e+00  0.0e+00 1.0e+00
## [164,] 0.0e+00  0.0e+00 1.0e+00
## [165,] 0.0e+00  0.0e+00 1.0e+00
## [166,] 0.0e+00  0.0e+00 1.0e+00
## [167,] 0.0e+00  0.0e+00 1.0e+00
## [168,] 0.0e+00  0.0e+00 1.0e+00
## [169,] 0.0e+00  0.0e+00 1.0e+00
## [170,] 0.0e+00  0.0e+00 1.0e+00
## [171,] 0.0e+00  0.0e+00 1.0e+00
## [172,] 0.0e+00  0.0e+00 1.0e+00
## [173,] 0.0e+00  0.0e+00 1.0e+00
## [174,] 0.0e+00  0.0e+00 1.0e+00
## [175,] 0.0e+00  0.0e+00 1.0e+00
## [176,] 0.0e+00  0.0e+00 1.0e+00
## [177,] 0.0e+00  0.0e+00 1.0e+00
## [178,] 0.0e+00  0.0e+00 1.0e+00
## [179,] 0.0e+00  0.0e+00 1.0e+00
## [180,] 0.0e+00  0.0e+00 1.0e+00
plot(
  x = Mclust_meter,
  what = "BIC",
  sub = "We can see that EII has the lowest BIC"
)

plot(
  x = Mclust_meter,
  what = "classification"
)

plot(
  x = Mclust_meter,
  what = "uncertainty"
)

v_emModelNames <- mclust.options("emModelNames")
names(v_emModelNames) <- v_emModelNames
for(j in v_emModelNames) v_emModelNames[j] <- mclustModelNames(j)$type
data.frame(
  model = v_emModelNames
)
##                                                   model
## EII                             spherical, equal volume
## VII                           spherical, varying volume
## EEI                    diagonal, equal volume and shape
## VEI                               diagonal, equal shape
## EVI               diagonal, equal volume, varying shape
## VVI                  diagonal, varying volume and shape
## EEE    ellipsoidal, equal volume, shape and orientation
## VEE            ellipsoidal, equal shape and orientation
## EVE           ellipsoidal, equal volume and orientation
## VVE                      ellipsoidal, equal orientation
## EEV                 ellipsoidal, equal volume and shape
## VEV                            ellipsoidal, equal shape
## EVV                           ellipsoidal, equal volume
## VVV ellipsoidal, varying volume, shape, and orientation
list_Mclust <- list()
for(j in names(v_emModelNames)){
  list_Mclust[[j]] <- Mclust(
    data = meter,
    modelNames = j
  )  
  plot(
    x = list_Mclust[[j]],
    what = "classification",
    sub = paste0("Number of clusters = ",list_Mclust[[j]]$G,", BIC = ",round(list_Mclust[[j]]$bic))
  )
  title(
    main = paste(
      "Model based clustering of meter data\n",j
    )
  )
}

hc_meter <- hc(
  data = meter,
  modelName = "EII"
)
plot(
  x = hc_meter
)

We suggest to utilze EII for the model. It has the lowest BIC value and the hierarchical cluster looks good.

Variable clustering

Transpose Method

t_M <- t(M)
v_dist <- c("euclidean","maximum","manhattan","canberra","binary","minkowski")
v_hclust <- c("ward.D","ward.D2","single","complete","average","mcquitty","median","centroid")

M_coef <- matrix(
  data = NA,
  nrow = length(v_dist),
  ncol = length(v_hclust),
  dimnames = list(v_dist,v_hclust)
)
list_dist <- lapply(
  X = v_dist,
  FUN = function(x) dist(t_M,method = x)
)
names(list_dist) <- v_dist
list_hclust <- list()
for(j in v_dist) for(k in v_hclust) list_hclust[[j]][[k]] <- hclust(
  d = list_dist[[j]],
  method = k
)
for(j in v_dist) for(k in v_hclust) try({
  list_hclust[[j]][[k]]$height <- rank(list_hclust[[j]][[k]]$height)
  M_coef[j,k] <- cluster::coef.hclust(list_hclust[[j]][[k]])
})
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
## Error in cluster::coef.hclust(list_hclust[[j]][[k]]) : 
##   !is.unsorted(ht) is not TRUE
for(j in v_dist) for(k in v_hclust) M_coef <- M_coef[order(M_coef[,k]),order(M_coef[j,])]
M_coef
##           single mcquitty average complete ward.D2 ward.D median centroid
## binary      0.50     0.50    0.50     0.50    0.50   0.50   0.50     0.50
## manhattan   0.63     0.63    0.63     0.63    0.64   0.65   0.63     0.63
## maximum     0.60     0.60    0.60     0.61    0.61   0.63     NA       NA
## euclidean   0.62     0.62    0.62     0.62    0.63   0.64     NA       NA
## minkowski   0.62     0.62    0.62     0.62    0.63   0.64     NA       NA
## canberra    0.66     0.67    0.67     0.67    0.68   0.68     NA       NA
par(mfrow = 1:2)
plot(list_hclust[["maximum"]][["ward.D"]])
plot(list_hclust[["canberra"]][["single"]])

kmeans_M <- kmeans(
  x = t_M,
  centers = 4
)
kmeans_M
## K-means clustering with 4 clusters of sizes 8, 12, 16, 7
## 
## Cluster means:
##      [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]   [,9]  [,10]  [,11]
## 1    0.35    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3
## 2 2863.95 2839.4 2839.9 2842.2 2838.4 2843.6 2843.0 2833.6 2815.5 2801.2 2791.4
## 3  101.75  101.7  101.7  101.7  101.7  101.7  101.7  101.7  101.6  101.7  101.8
## 4    1.85    2.5    2.5    2.5    2.5    2.5    2.5    2.9    3.5    3.9    4.2
##    [,12]  [,13]  [,14]  [,15]  [,16]  [,17]  [,18]   [,19]   [,20]   [,21]
## 1    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.27    0.27    0.27
## 2 2783.8 2787.1 2783.0 2782.8 2787.6 2776.5 2765.4 2849.44 2836.97 2836.53
## 3  101.8  101.9  101.9  101.9  101.9  101.8  101.8  101.80  101.79  101.77
## 4    4.2    4.2    4.2    4.2    4.2    4.5    4.8    1.87    2.44    2.43
##     [,22]   [,23]   [,24]   [,25]   [,26]   [,27]   [,28]   [,29]   [,30]
## 1    0.27    0.27    0.27    0.27    0.27    0.27    0.27    0.27    0.27
## 2 2838.30 2837.53 2834.54 2836.27 2825.53 2807.97 2796.36 2785.42 2780.70
## 3  101.76  101.75  101.75  101.74  101.73  101.70  101.69  101.68  101.75
## 4    2.43    2.43    2.42    2.45    2.92    3.57    3.85    4.16    4.15
##     [,31]   [,32]   [,33]   [,34]   [,35]   [,36]   [,37]   [,38]   [,39]
## 1    0.27    0.27    0.27    0.27    0.27    0.27    0.45    0.45    0.45
## 2 2779.99 2779.51 2780.87 2780.27 2768.75 2756.37 2891.00 2878.53 2882.24
## 3  101.80  101.85  101.89  101.92  101.87  101.81  101.72  101.73  101.73
## 4    4.14    4.17    4.15    4.17    4.48    4.81    1.82    2.50    2.50
##     [,40]   [,41]   [,42]   [,43]   [,44]   [,45]   [,46]   [,47]   [,48]
## 1    0.45    0.45    0.45    0.45    0.45    0.45    0.45    0.45    0.45
## 2 2881.35 2880.31 2872.91 2849.04 2841.37 2827.58 2827.02 2829.79 2828.78
## 3  101.74  101.74  101.74  101.73  101.72  101.71  101.70  101.69  101.68
## 4    2.49    2.49    2.92    3.51    3.83    4.19    4.19    4.19    4.20
##     [,49]   [,50]   [,51]  [,52]  [,53]  [,54]  [,55]  [,56]  [,57]  [,58]
## 1    0.45    0.45    0.45    2.2    3.9    3.3    3.5    4.8    2.9    4.6
## 2 2830.75 2814.15 2806.80 2754.3 2711.4 2742.4 2722.9 2852.2 2720.8 2568.3
## 3  101.67  101.65  101.62  102.6  102.8  102.3  102.3  102.5  102.7  102.6
## 4    4.19    4.47    4.78    2.1    2.1    2.1    2.1    2.1    3.6    3.6
##    [,59]  [,60]  [,61]  [,62]  [,63]  [,64]  [,65]  [,66]  [,67]   [,68]
## 1   11.7    4.9    3.1   20.4   27.6    5.4    8.9   19.9   32.7    0.47
## 2 2656.6 2654.7 2707.5 2828.3 2588.8 2984.7 2964.5 3024.1 2448.0 2866.69
## 3  102.1  102.4  101.9  103.6  102.0  102.8  102.9  101.6  101.1  101.77
## 4    3.6    3.6    3.6    3.5    2.6    5.2    5.2    5.2    4.8    1.21
##     [,69]   [,70]   [,71]  [,72]  [,73]  [,74]  [,75]  [,76]  [,77]  [,78]
## 1    0.47    0.53    0.51    1.7    2.9    6.0    0.2    0.2    0.2    0.2
## 2 2883.07 2898.33 2878.79 2751.6 2769.3 2678.4 2820.7 2811.2 2810.7 2810.9
## 3  101.70  101.80  101.67  102.0  102.0  102.1  101.7  101.7  101.7  101.7
## 4    1.19    1.19    1.17    1.2    1.2    1.1    1.9    2.5    2.5    2.5
##    [,79]  [,80]  [,81]  [,82]  [,83]   [,84]  [,85]  [,86]  [,87]  [,88]  [,89]
## 1    0.2    0.2    0.2    0.2    0.2    0.21    0.3    0.3    0.3    0.3    0.3
## 2 2812.7 2808.4 2807.1 2796.9 2771.8 2767.28 2781.2 2785.5 2783.5 2786.2 2785.8
## 3  101.7  101.7  101.8  101.9  101.9  101.89  101.9  101.8  101.8  101.8  101.8
## 4    2.5    2.5    2.5    2.9    3.6    3.83    4.2    4.2    4.1    4.2    4.2
##    [,90]  [,91]  [,92]  [,93]  [,94]  [,95]  [,96]  [,97]  [,98]  [,99] [,100]
## 1    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3
## 2 2788.5 2770.7 2756.5 2853.2 2841.0 2839.2 2840.4 2839.2 2833.9 2836.5 2825.0
## 3  101.8  101.7  101.7  101.7  101.7  101.8  101.8  101.9  101.9  102.0  102.0
## 4    4.2    4.5    4.8    1.8    2.5    2.5    2.5    2.5    2.5    2.5    2.9
##   [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111]
## 1    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.3    0.4
## 2 2809.8 2796.2 2786.0 2788.0 2787.9 2787.9 2787.0 2784.2 2774.0 2761.8 2863.3
## 3  101.9  101.9  101.9  101.9  101.8  101.8  101.8  101.8  101.8  101.8  102.0
## 4    3.5    3.9    4.2    4.2    4.2    4.2    4.2    4.1    4.5    4.8    1.8
##   [,112] [,113] [,114] [,115] [,116] [,117] [,118] [,119] [,120] [,121] [,122]
## 1    0.4    0.4    0.4    0.4    0.4    0.4    0.4    0.4    0.4    0.4    0.4
## 2 2862.8 2850.4 2851.4 2850.8 2854.0 2852.8 2847.3 2839.4 2816.7 2805.6 2785.9
## 3  102.0  101.9  101.9  101.9  101.9  101.9  101.9  101.9  101.9  101.9  101.8
## 4    1.8    2.4    2.5    2.5    2.5    2.4    2.4    2.9    3.5    3.9    4.2
##   [,123] [,124] [,125] [,126] [,127]  [,128]  [,129] [,130] [,131] [,132]
## 1    0.4    0.4    0.4    0.4    0.4    0.46    0.53     12   11.5   11.5
## 2 2792.6 2792.3 2792.5 2792.8 2790.0 2795.65 2795.02   2445 2459.2 2467.8
## 3  101.8  101.8  101.8  101.8  101.8  101.72  101.63    100   99.8   99.7
## 4    4.2    4.2    4.2    4.2    4.2    4.48    4.78      2    2.4    2.4
##   [,133] [,134] [,135] [,136] [,137] [,138] [,139] [,140] [,141] [,142] [,143]
## 1   11.5   11.4   11.4   11.4   11.4   11.4   11.4   11.4   11.4   11.4   11.4
## 2 2466.7 2457.7 2461.4 2456.5 2452.8 2435.2 2425.9 2408.7 2417.0 2422.1 2411.6
## 3   99.7   99.6   99.4   99.7   98.9   99.6   99.7   99.8   99.7   99.6   99.6
## 4    2.4    2.4    2.4    2.4    2.8    3.2    3.5    3.8    3.7    3.7    3.7
##   [,144] [,145] [,146] [,147] [,148] [,149] [,150] [,151] [,152] [,153] [,154]
## 1   11.4   11.4     11   11.5   11.5   11.5   38.0   37.9   37.7   37.4   37.1
## 2 2407.6 2407.0   2414 2435.8 2432.1 2498.0 2332.2 2278.4 2280.8 2281.1 2301.6
## 3   99.7   99.4     98   98.4   98.3   98.7   94.6   95.3   95.9   96.4   94.9
## 4    3.7    3.7      4    4.2    4.3    2.4    1.9    2.4    2.4    2.5    2.5
##   [,155] [,156] [,157] [,158] [,159] [,160] [,161] [,162] [,163] [,164] [,165]
## 1   37.0   37.0   36.9   36.6   36.5     37     37     37     37     37     37
## 2 2350.9 2376.3 2349.6 2304.5 2249.4   2232   2169   2158   2158   2179   2193
## 3   95.2   95.2   95.9   96.5   96.5     97     95     95     95     95     95
## 4    2.5    2.5    2.8    3.3    3.5      4      4      4      4      4      4
##   [,166] [,167] [,168]  [,169] [,170] [,171] [,172]  [,173]  [,174] [,175]
## 1   36.9   36.9   36.7   45.20   45.1     45   45.1   45.00   45.00   45.0
## 2 2245.7 2236.5 2220.3 1748.27 1708.4   1707 1716.5 1723.04 1709.07 1705.8
## 3   95.0   96.3   96.3   96.63   95.9     96   95.8   95.97   95.61   96.1
## 4    4.2    4.3    4.3   -0.66   -4.7      1    2.1    0.44   -0.59    1.5
##   [,176]  [,177]  [,178]  [,179]  [,180]
## 1   44.9   44.88   44.77   44.88   44.88
## 2 1689.2 1693.72 1700.96 1734.60 1716.56
## 3   95.0   95.26   95.62   96.54   96.54
## 4    1.1   -0.42   -0.41   -0.38   -0.44
## 
## Clustering vector:
##   Profile_Factor         Symmetry        Crossflow   Flow_Velocity1 
##                4                4                4                4 
##   Flow_Velocity2   Flow_Velocity3   Flow_Velocity4  Speed_of_Sound1 
##                4                4                4                2 
##  Speed_of_Sound2  Speed_of_Sound3  Speed_of_Sound4 Signal_Strength1 
##                2                2                2                3 
## Signal_Strength2 Signal_Strength3 Signal_Strength4 Signal_Strength5 
##                3                3                3                3 
## Signal_Strength6 Signal_Strength7 Signal_Strength8  Signal_Quality1 
##                3                3                3                2 
##  Signal_Quality2  Signal_Quality3  Signal_Quality4  Signal_Quality5 
##                2                2                2                2 
##  Signal_Quality6  Signal_Quality7  Signal_Quality8            Gain1 
##                2                2                2                1 
##            Gain2            Gain3            Gain4            Gain5 
##                1                1                1                1 
##            Gain6            Gain7            Gain8    Transit_Time1 
##                1                1                1                3 
##    Transit_Time2    Transit_Time3    Transit_Time4    Transit_Time5 
##                3                3                3                3 
##    Transit_Time6    Transit_Time7    Transit_Time8 
##                3                3                3 
## 
## Within cluster sum of squares by cluster:
## [1] 7.2e+04 1.7e+09 4.2e+06 7.4e+03
##  (between_SS / total_SS =  85.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
prcomp_t_M <- prcomp(t_M)
plot(
  x = prcomp_t_M$x[,1:2],
  col = kmeans_M$cluster,
  pch = "."
)
text(
  x = prcomp_t_M$x[,1:2],
  col = kmeans_M$cluster,
  labels = rownames(t_M)
)

ClustOfVar

hclustvar_M <- ClustOfVar::hclustvar(
  X.quanti = scale_M,
  init = c(1,1,1,1,2,2,2,3,3,3)
)
ClustOfVar::print.hclustvar(hclustvar_M)
## 
## Call:
## ClustOfVar::hclustvar(X.quanti = scale_M, init = c(1, 1, 1, 1,     2, 2, 2, 3, 3, 3))
## 
## 
## number of variables:  43
## number of objects:  180
## 
## available components: height clusmat merge
ClustOfVar::plot.hclustvar(hclustvar_M)

hclustvar_M <- ClustOfVar::hclustvar(
  X.quanti = scale_M
)
ClustOfVar::print.hclustvar(hclustvar_M)
## 
## Call:
## ClustOfVar::hclustvar(X.quanti = scale_M)
## 
## 
## number of variables:  43
## number of objects:  180
## 
## available components: height clusmat merge
ClustOfVar::plot.hclustvar(hclustvar_M, sub = "ClustOfVar Variable Clustering")

stability_M <- ClustOfVar::stability(
  tree = hclustvar_M,
  B = 100,
  graph = TRUE
)

### Hmisc::varclus()

varclust_spearman_wardD <- Hmisc::varclus(
  x = scale_M,
  similarity = "spearman",
  method = "ward.D"
)
plot(varclust_spearman_wardD)

Based on the three methods analyzed above, the ClustOfVar method is recommended. Specifically, cutting into 3 or 8 groups appears optimal according to the stability graph, which suggests these cluster numbers provide the best results. The Transpose method is not recommended, as the resulting clusters appear too disordered and lack clear separation.

Variable Selection

PCA

v_color <- viridis::viridis(4)
names(v_color) <- sort(
  x = unique(
    x = meter$Health_State_of_Meter
  )
)
v_color <- v_color[meter$Health_State_of_Meter]

D <- meter[,colnames(meter) != "Health_State_of_Meter"]
D <- scale(
  x = D
)
prcomp_meter <- prcomp(
  x = D,
  center = FALSE,
  scale. = FALSE
)
round(
  x = summary(
    object = prcomp_meter
  )$importance,
  digits = 2
)
##                         PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 PC11
## Standard deviation     4.70 2.78 1.75 1.53 1.39 1.31 0.88 0.85 0.78 0.65 0.56
## Proportion of Variance 0.51 0.18 0.07 0.05 0.04 0.04 0.02 0.02 0.01 0.01 0.01
## Cumulative Proportion  0.51 0.69 0.77 0.82 0.86 0.90 0.92 0.94 0.95 0.96 0.97
##                        PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22
## Standard deviation     0.51 0.48 0.44 0.38 0.29 0.29 0.25 0.21 0.19 0.18 0.17
## Proportion of Variance 0.01 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## Cumulative Proportion  0.98 0.98 0.99 0.99 0.99 0.99 0.99 1.00 1.00 1.00 1.00
##                        PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33
## Standard deviation     0.14 0.13 0.11 0.09 0.08 0.07 0.05 0.05 0.04 0.03 0.03
## Proportion of Variance 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## Cumulative Proportion  1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
##                        PC34 PC35 PC36 PC37 PC38 PC39 PC40 PC41 PC42 PC43
## Standard deviation     0.02 0.02 0.02 0.01 0.01 0.01    0    0    0    0
## Proportion of Variance 0.00 0.00 0.00 0.00 0.00 0.00    0    0    0    0
## Cumulative Proportion  1.00 1.00 1.00 1.00 1.00 1.00    1    1    1    1
biplot(
  x = prcomp_meter
)

D <- meter[!rownames(meter) %in% c("169","170","172","173"),colnames(meter) != "Health_State_of_Meter"]
D <- scale(
  x = D
)
prcomp_meter <- prcomp(
  x = D,
  center = FALSE,
  scale. = FALSE
)
round(
  x = summary(
    object = prcomp_meter
  )$importance,
  digits = 2
)
##                         PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 PC11
## Standard deviation     4.75 2.71 1.85 1.57 1.45 1.08 0.96 0.90 0.67 0.63 0.58
## Proportion of Variance 0.52 0.17 0.08 0.06 0.05 0.03 0.02 0.02 0.01 0.01 0.01
## Cumulative Proportion  0.52 0.69 0.77 0.83 0.88 0.91 0.93 0.95 0.96 0.97 0.97
##                        PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22
## Standard deviation     0.53 0.46 0.37 0.32 0.27 0.25 0.22  0.2 0.18 0.16 0.14
## Proportion of Variance 0.01 0.00 0.00 0.00 0.00 0.00 0.00  0.0 0.00 0.00 0.00
## Cumulative Proportion  0.98 0.99 0.99 0.99 0.99 0.99 1.00  1.0 1.00 1.00 1.00
##                        PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33
## Standard deviation     0.12 0.11  0.1 0.08 0.08 0.06 0.05 0.05 0.04 0.03 0.03
## Proportion of Variance 0.00 0.00  0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## Cumulative Proportion  1.00 1.00  1.0 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
##                        PC34 PC35 PC36 PC37 PC38 PC39 PC40 PC41 PC42 PC43
## Standard deviation     0.02 0.02 0.02 0.01 0.01 0.01    0    0    0    0
## Proportion of Variance 0.00 0.00 0.00 0.00 0.00 0.00    0    0    0    0
## Cumulative Proportion  1.00 1.00 1.00 1.00 1.00 1.00    1    1    1    1
plot(
  x = prcomp_meter,
  type = "l"
)

screeplot(
  x = prcomp_meter
)

biplot(
  x = prcomp_meter
)

factoextra::fviz_eig(
  X = prcomp_meter
)

factoextra::fviz_pca_ind(
  prcomp_meter,
  col.ind = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

factoextra::fviz_pca_var(
  X = prcomp_meter,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

factoextra::fviz_pca_biplot(
  X = prcomp_meter,
  repel = TRUE,
  col.var = "#2E9FDF",
  col.ind = "#696969"
)

pairs(prcomp_meter$x[,1:4],col = v_color,pch = 19)

The Scree Plot shows the first 2 or 3 dimensions capture the majority of the explained variance and are therefore the best to utilize.

v <- sort(prcomp_meter$rotation[,"PC1"])
v <- v[abs(v) > 0.15]
M <- matrix(v)
rownames(M) <- names(v)
M
##                   [,1]
## Signal_Quality5  -0.20
## Signal_Strength2 -0.20
## Signal_Strength1 -0.20
## Signal_Quality6  -0.19
## Signal_Strength6 -0.19
## Signal_Strength5 -0.19
## Signal_Strength4 -0.19
## Signal_Strength3 -0.19
## Signal_Quality7  -0.18
## Signal_Quality8  -0.18
## Signal_Strength7 -0.18
## Signal_Strength8 -0.18
## Signal_Quality1  -0.17
## Signal_Quality2  -0.16
## Flow_Velocity4   -0.16
## Transit_Time7    -0.16
## Speed_of_Sound4   0.16
## Gain8             0.18
## Gain7             0.18
## Gain4             0.18
## Gain3             0.18
## Gain1             0.19
## Gain2             0.19
## Gain6             0.20
## Gain5             0.20
par(las = 3)
barplot(prcomp_meter$rotation[,"PC1"])

Above we seek to find what variables are most significant when compared with our target Health_State_of_Meter. We found that the variables that are likely to be the most strongly associated with the target variable Health_State_of_Meter.Are Gain5 and Gain6, with coefficients of 0.20, followed by Signal_Quality5, Signal_Strength2, and Signal_Strength1, all with coefficients of -0.20.